\relax 
\ifx\hyper@anchor\@undefined
\global \let \oldcontentsline\contentsline
\gdef \contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global \let \oldnewlabel\newlabel
\gdef \newlabel#1#2{\newlabelxx{#1}#2}
\gdef \newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\let \contentsline\oldcontentsline
\let \newlabel\oldnewlabel}
\else
\global \let \hyper@last\relax 
\fi

\providecommand*\HyPL@Entry[1]{}
\citation{conservation-cores}
\citation{isca11-darksilicon}
\citation{ieeemicro-sprinting}
\citation{Dennard1974}
\citation{NTC-UMich}
\citation{steepslope}
\citation{mookerjea}
\citation{ionescu-nems}
\citation{ionescu-3D}
\citation{yibo-yield-iccad}
\citation{mookerjea}
\citation{seabaugh}
\citation{dac11}
\citation{yibo-yield-iccad}
\citation{reetu-3d-cost}
\HyPL@Entry{0 << /S /D >> }
\newlabel{sec:abstract}{{}{1}{\relax }{}{}}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}}
\newlabel{sec:introduction}{{1}{1}{Introduction\relax }{section.1}{}}
\citation{taylor-dac2012}
\citation{emma-3d}
\citation{codes12}
\@writefile{toc}{\contentsline {section}{\numberline {2}Motivation}{2}{section.2}}
\newlabel{sec:motivation}{{2}{2}{Motivation\relax }{section.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Thermal constraints on processor execution}{2}{subsection.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Yield constraints on processor design}{2}{subsection.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Opportunities with TFET processors}{2}{subsection.2.3}}
\citation{Lu-tfet-scaling}
\citation{czornomaz-iedm13}
\citation{rooyackers-iedm13}
\citation{tomioka-iedm13}
\citation{iedm12}
\citation{tcad-sentaurus}
\citation{codes12}
\citation{mcpat}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces  a) and b) Demonstration of yield and thermal limits on performance scaling in the frequency (X) domain and the parallelism (Y) domain for a well scaling application (\emph  {barnes}) and poorly scaling application (\emph  {ocean.ncont}) }}{3}{figure.1}}
\newlabel{fig:building-collapse-motivation}{{1}{3}{\label {fig:building-collapse-motivation} a) and b) Demonstration of yield and thermal limits on performance scaling in the frequency (X) domain and the parallelism (Y) domain for a well scaling application (\emph {barnes}) and poorly scaling application (\emph {ocean.ncont}) \relax }{figure.1}{}}
\@writefile{toc}{\contentsline {section}{\numberline {3}Modeling TFET-based processors}{3}{section.3}}
\newlabel{sec:background}{{3}{3}{Modeling TFET-based processors\relax }{section.3}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Basic background on Steep-slope devices}{3}{subsection.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Extrapolation to processor model}{3}{subsection.3.2}}
\citation{dac11}
\citation{Lu-tfet-scaling}
\citation{itrs2011}
\citation{tcad-sentaurus}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces  a) and b) Fraction of wire power and wire delay to total core power and delay respectively.}}{4}{figure.2}}
\newlabel{fig:wire-power-delay}{{2}{4}{\label {fig:wire-power-delay} a) and b) Fraction of wire power and wire delay to total core power and delay respectively}{figure.2}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces  Variation of CMOS and TFET core power with frequency for a simple (Atom-like) core and a complex (Ivybridge-like) core. The crossover point is seen to shift to the left as core complexity increases. }}{4}{figure.3}}
\newlabel{fig:crossover}{{3}{4}{\label {fig:crossover} Variation of CMOS and TFET core power with frequency for a simple (Atom-like) core and a complex (Ivybridge-like) core. The crossover point is seen to shift to the left as core complexity increases. \relax }{figure.3}{}}
\@writefile{toc}{\contentsline {section}{\numberline {4}Design Space Analysis}{4}{section.4}}
\newlabel{sec:technique}{{4}{4}{Design Space Analysis\relax }{section.4}{}}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Configuration of the evaluation platform.}}{4}{table.1}}
\newlabel{tab:design-space}{{1}{4}{Configuration of the evaluation platform}{table.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Technology variation}{4}{subsection.4.1}}
\citation{yibo-yield-iccad}
\citation{yibo-yield-iccad}
\citation{emma-3d}
\citation{tfet-intel}
\citation{tfet-sram}
\citation{microfluidic-cooling}
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces  Variation of total critical path delay (logic + wire delay) in FinFET and TFET processors at 22, 14 and 10\nobreakspace  {}nm technology nodes. Scaling is demonstrated both for the ITRS roadmap projections and TCAD simulations for FinFET and TFET.}}{5}{figure.4}}
\newlabel{fig:technology-scaling3a}{{4}{5}{\label {fig:technology-scaling3a} Variation of total critical path delay (logic + wire delay) in FinFET and TFET processors at 22, 14 and 10~nm technology nodes. Scaling is demonstrated both for the ITRS roadmap projections and TCAD simulations for FinFET and TFET}{figure.4}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces  Variation of total core power (including logic and wire power) in a CMOS and TFET processor at 22\nobreakspace  {}nm, 14\nobreakspace  {}nm and 10\nobreakspace  {}nm technology nodes. }}{5}{figure.5}}
\newlabel{fig:technology-scaling3b}{{5}{5}{\label {fig:technology-scaling3b} Variation of total core power (including logic and wire power) in a CMOS and TFET processor at 22~nm, 14~nm and 10~nm technology nodes. \relax }{figure.5}{}}
\newlabel{fig:figure2}{{5}{5}{\label {fig:technology-scaling3b} Variation of total core power (including logic and wire power) in a CMOS and TFET processor at 22~nm, 14~nm and 10~nm technology nodes. \relax }{figure.5}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Yield aware stacking of processors}{5}{subsection.4.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Modeling thermal distribution across multicores}{5}{subsection.4.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Variation in microarchitecture}{5}{subsection.4.4}}
\citation{sniper}
\citation{mcpat}
\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces  Number of core layers required to realize a range of functioning cores for different area footprints. The fraction of redundant cores can be seen to increase both with area and with number of layers}}{6}{figure.6}}
\newlabel{fig:useful-cores-sampled}{{6}{6}{\label {fig:useful-cores-sampled} Number of core layers required to realize a range of functioning cores for different area footprints. The fraction of redundant cores can be seen to increase both with area and with number of layers\relax }{figure.6}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces  a) and b) Delineation of design space attainable by CMOS(red), TFET(blue), both (green) and neither (black) cores to obtain peak performance, for a scalable (\emph  {barnes}) and non-scalable (\emph  {ocean.ncont}) application respectively. The best performance is seen in the TFET configuration in \emph  {barnes} and in the CMOS configuration in \emph  {ocean.ncont}.}}{6}{figure.7}}
\newlabel{fig:building-collapse-result}{{7}{6}{\label {fig:building-collapse-result} a) and b) Delineation of design space attainable by CMOS(red), TFET(blue), both (green) and neither (black) cores to obtain peak performance, for a scalable (\emph {barnes}) and non-scalable (\emph {ocean.ncont}) application respectively. The best performance is seen in the TFET configuration in \emph {barnes} and in the CMOS configuration in \emph {ocean.ncont}}{figure.7}{}}
\@writefile{toc}{\contentsline {section}{\numberline {5}System Infrastructure and Simulation Tools}{6}{section.5}}
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Configuration of the evaluation platform.}}{6}{table.2}}
\newlabel{tab:syspara}{{2}{6}{Configuration of the evaluation platform}{table.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Architectural simulation setup and benchmarks}{6}{subsection.5.1}}
\citation{hotspot}
\citation{ieeemicro-llano}
\citation{hotspot3d}
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Technology parameters.}}{7}{table.3}}
\newlabel{tab:device-params}{{3}{7}{Technology parameters}{table.3}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Modeling thermal variation}{7}{subsection.5.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Scheduling diverse workloads on a stacked CMOS-TFET multicore}{7}{subsection.5.3}}
\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces  Variation in thermal hotspots on a layer of a system of 3D stacked cores with shared L3 cache.}}{7}{figure.8}}
\newlabel{fig:hotspot-profile}{{8}{7}{\label {fig:hotspot-profile} Variation in thermal hotspots on a layer of a system of 3D stacked cores with shared L3 cache}{figure.8}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces  Characterization of \emph  {SPEC CPU2006}, \emph  {Parsec} and \emph  {Splash2} based on scalability and memory utilization }}{7}{figure.9}}
\newlabel{fig:workloads-mpki}{{9}{7}{\label {fig:workloads-mpki} Characterization of \emph {SPEC CPU2006}, \emph {Parsec} and \emph {Splash2} based on scalability and memory utilization \relax }{figure.9}{}}
\citation{parsec}
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Configuration of the evaluation platform.}}{8}{table.4}}
\newlabel{tab:workloads}{{4}{8}{Configuration of the evaluation platform}{table.4}{}}
\newlabel{sec:methodology}{{5.3}{8}{Scheduling diverse workloads on a stacked CMOS-TFET multicore\relax }{table.4}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces  Different operating states of the heterogeneous multicore: a) 2 highly scalable parallel applications scheduled on the entire multicore. b) 2 completely sequential applications scheduled exclusively on CMOS cores. c) A sequential application, scheduled on a CMOS core, running alongside a weakly scaling application. The latter is scheduled on either the remaining CMOS cores or TFET cores depending its optimal configuration. d) A sequential application, scheduled on a CMOS core, running alongside a highly parallel application scheduled on the entire set of TFET cores.}}{8}{figure.10}}
\newlabel{fig:blockdig-3d}{{10}{8}{\label {fig:blockdig-3d} Different operating states of the heterogeneous multicore: a) 2 highly scalable parallel applications scheduled on the entire multicore. b) 2 completely sequential applications scheduled exclusively on CMOS cores. c) A sequential application, scheduled on a CMOS core, running alongside a weakly scaling application. The latter is scheduled on either the remaining CMOS cores or TFET cores depending its optimal configuration. d) A sequential application, scheduled on a CMOS core, running alongside a highly parallel application scheduled on the entire set of TFET cores}{figure.10}{}}
\@writefile{toc}{\contentsline {section}{\numberline {6}Results}{8}{section.6}}
\newlabel{sec:results}{{6}{8}{Results\relax }{section.6}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Optimal operating points in the design space}{8}{subsection.6.1}}
\@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces  a) and b) Mean speedup of different applications in the \emph  {splash2} and \emph  {parsec} suites respectively. The \emph  {splash2} applications, on average, prefer higher frequency and fewer cores to operate (red CMOS space and green common space). This is because $\sim $29\% of applications are relatively poor scaling. On the other hand, \emph  {parsec} benchmarks operate most efficiently on larger number of cores and lower frequencies (blue TFET space and green common space), with only $\sim $17\% applications preferring high frequency CMOS cores}}{9}{figure.11}}
\newlabel{fig:building-collapse-workloads}{{11}{9}{\label {fig:building-collapse-workloads} a) and b) Mean speedup of different applications in the \emph {splash2} and \emph {parsec} suites respectively. The \emph {splash2} applications, on average, prefer higher frequency and fewer cores to operate (red CMOS space and green common space). This is because $\sim $29\% of applications are relatively poor scaling. On the other hand, \emph {parsec} benchmarks operate most efficiently on larger number of cores and lower frequencies (blue TFET space and green common space), with only $\sim $17\% applications preferring high frequency CMOS cores\relax }{figure.11}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces  a) and b) Mean power savings obtained in \emph  {parsec} and \emph  {splash2} suites respectively by running workloads on CMOS (red region) or TFET (blue region)cores. This plot is restricted only to the region where both CMOS and TFET cores are capable of operation in order to determine the more efficient core.}}{9}{figure.12}}
\newlabel{fig:power-workloads}{{12}{9}{\label {fig:power-workloads} a) and b) Mean power savings obtained in \emph {parsec} and \emph {splash2} suites respectively by running workloads on CMOS (red region) or TFET (blue region)cores. This plot is restricted only to the region where both CMOS and TFET cores are capable of operation in order to determine the more efficient core}{figure.12}{}}
\@writefile{lot}{\contentsline {table}{\numberline {5}{\ignorespaces Best performing configuration for each workload}}{9}{table.5}}
\newlabel{tab:bestconfig}{{5}{9}{Best performing configuration for each workload\relax }{table.5}{}}
\citation{ionescu-nems}
\citation{islped11}
\citation{codes12}
\citation{ieeemicro-tfet}
\@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces  Relative performances of 3D stacked CMOS and TFET configurations using an 8 stacked layers consisting of 64 functioning 4 issue processors. The thermal budget assumed here is 87$^\circ $C.}}{10}{figure.13}}
\newlabel{fig:speedup-4issue-notext}{{13}{10}{\label {fig:speedup-4issue-notext} Relative performances of 3D stacked CMOS and TFET configurations using an 8 stacked layers consisting of 64 functioning 4 issue processors. The thermal budget assumed here is 87$^\circ $C}{figure.13}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.2}Sensitivity to thermal budget}{10}{subsection.6.2}}
\@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces  Variation of performance improvement of TFET core as opposed to CMOS cores for different thermal limits. Evaluations are carried out separately for \emph  {Splash} and \emph  {PARSEC} benchmark suites}}{10}{figure.14}}
\newlabel{fig:sensitivity-temperature}{{14}{10}{\label {fig:sensitivity-temperature} Variation of performance improvement of TFET core as opposed to CMOS cores for different thermal limits. Evaluations are carried out separately for \emph {Splash} and \emph {PARSEC} benchmark suites\relax }{figure.14}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.3}Sensitivity to microarchitecture}{10}{subsection.6.3}}
\@writefile{lof}{\contentsline {figure}{\numberline {15}{\ignorespaces  Comparison of performance speedup of CMOS and TFET cores for different microarchitectural configurations. Evaluations are carried out separately for \emph  {Splash2} and \emph  {Parsec} benchmark suites for issue widths of 1 to 8}}{10}{figure.15}}
\newlabel{fig:sensitivity-issue-width2}{{15}{10}{\label {fig:sensitivity-issue-width2} Comparison of performance speedup of CMOS and TFET cores for different microarchitectural configurations. Evaluations are carried out separately for \emph {Splash2} and \emph {Parsec} benchmark suites for issue widths of 1 to 8\relax }{figure.15}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.4}Heterogeneity aware scheduling}{10}{subsection.6.4}}
\@writefile{toc}{\contentsline {section}{\numberline {7}Related Work}{10}{section.7}}
\newlabel{sec:related}{{7}{10}{Related Work\relax }{section.7}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {7.1}Architectures for processors in emerging technologies}{10}{subsection.7.1}}
\citation{ionescu-3D}
\citation{tullsen-3D}
\citation{micro03}
\citation{isca04}
\citation{morphcore}
\citation{corefusion}
\citation{corefusion}
\citation{morphcore}
\citation{NTC-UMich}
\citation{ieeemicro-sprinting}
\citation{prometheus}
\citation{thermal-dvfs-asu}
\citation{hhlee-3D-dvfs}
\@writefile{lof}{\contentsline {figure}{\numberline {16}{\ignorespaces  Performance comparison of homogeneous CMOS and TFET multicore with a heterogeneous 3D configuration with HAS, consisting of 1 CMOS layer and 7 TFET layers}}{11}{figure.16}}
\newlabel{fig:scheduling-results}{{16}{11}{\label {fig:scheduling-results} Performance comparison of homogeneous CMOS and TFET multicore with a heterogeneous 3D configuration with HAS, consisting of 1 CMOS layer and 7 TFET layers\relax }{figure.16}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {7.2}3D stacked architectures}{11}{subsection.7.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {7.3}Heterogeneous architectures}{11}{subsection.7.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {7.4}Architectural techniques for power and thermal aware execution}{11}{subsection.7.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {7.5}Thermal-Aware application mapping on multicores}{11}{subsection.7.5}}
\@writefile{toc}{\contentsline {section}{\numberline {8}Conclusion }{11}{section.8}}
\newlabel{sec:conclusion}{{8}{11}{Conclusion \relax }{section.8}{}}
\citation{bstctl:etal}
\citation{bstctl:nodash}
\citation{bstctl:simpurl}
\bibstyle{IEEEtranS}
\bibdata{references}
\bibcite{tcad-sentaurus}{1}
\bibcite{parsec}{2}
\bibcite{ieeemicro-llano}{3}
\bibcite{sniper}{4}
\bibcite{yibo-yield-iccad}{5}
\bibcite{hhlee-3D-dvfs}{6}
\bibcite{czornomaz-iedm13}{7}
\bibcite{Dennard1974}{8}
\bibcite{tfet-intel}{9}
\bibcite{NTC-UMich}{10}
\bibcite{emma-3d}{11}
\bibcite{isca11-darksilicon}{12}
\bibcite{ionescu-3D}{13}
\bibcite{thermal-dvfs-asu}{14}
\bibcite{tullsen-3D}{15}
\bibcite{hotspot}{16}
\bibcite{ionescu-nems}{17}
\bibcite{corefusion}{18}
\bibcite{itrs2011}{19}
\bibcite{morphcore}{20}
\bibcite{codes12}{21}
\bibcite{micro03}{22}
\bibcite{isca04}{23}
\bibcite{mcpat}{24}
\bibcite{iedm12}{25}
\bibcite{Lu-tfet-scaling}{26}
\bibcite{steepslope}{27}
\bibcite{hotspot3d}{28}
\bibcite{mookerjea}{29}
\bibcite{ieeemicro-sprinting}{30}
\bibcite{rooyackers-iedm13}{31}
\bibcite{tfet-sram}{32}
\bibcite{dac11}{33}
\bibcite{seabaugh}{34}
\bibcite{prometheus}{35}
\bibcite{islped11}{36}
\bibcite{ieeemicro-tfet}{37}
\bibcite{taylor-dac2012}{38}
\bibcite{tomioka-iedm13}{39}
\bibcite{conservation-cores}{40}
\bibcite{reetu-3d-cost}{41}
\bibcite{microfluidic-cooling}{42}
\newlabel{sec:acknowledgments}{{8}{12}{\relax }{section.8}{}}
